%!TEX root = paper.tex

% \clearpage
% NBA figures
% \begin{figure*}[ht]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-prune-prec}
%    \includegraphics[width=0.3\textwidth]{figs/nba/prune_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-prune-recl}
%    \includegraphics[width=0.3\textwidth]{figs/nba/prune_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-prune-dist}
%    \includegraphics[width=0.3\textwidth]{figs/nba/prune_dist.eps}}
%  \caption{Vary object selection strategies.  Fix $\zeta = .05$,
%    $\eta^+ = \eta^- = .1$. (NBA)}
%  \label{fig:expr-prune}
% \end{figure*}
% 
% \begin{figure*}[ht]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-norm-prec}
%    \includegraphics[width=0.3\textwidth]{figs/nba/norm_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-norm-recl}
%    \includegraphics[width=0.3\textwidth]{figs/nba/norm_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-norm-dist}
%    \includegraphics[width=0.3\textwidth]{figs/nba/norm_dist.eps}}
%  \caption{Vary L-norms in neighborhood definition.  Fix $\zeta = .05$,
%    $\eta^+ = \eta^- = .1$. (NBA)}
%  \label{fig:expr-norm}
% \end{figure*}

\begin{figure*}[ht]
 \centering
 \subfloat[Precision]{\label{fig:expr-budget-sample-prec}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_sample_prec.eps}}
 \subfloat[Recall]{\label{fig:expr-budget-sample-recl}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_sample_recl.eps}}
 \subfloat[Sketch Distance]{\label{fig:expr-budget-sample-dist}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_sample_dist.eps}}
 \caption{\textbf{(NBA)} Vary sample rate $\zeta$ between 1-10\%.
   Fix $\eta^+ = \eta^- = .1$.}
 \label{fig:expr-budget-sample}
\end{figure*}

\begin{figure*}[ht]
 \centering
 \subfloat[Precision]{\label{fig:expr-budget-eta-pos-prec}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_eta_pos_prec.eps}}
 \subfloat[Recall]{\label{fig:expr-budget-eta-pos-recl}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_eta_pos_recl.eps}}
 \subfloat[Sketch Distance]{\label{fig:expr-budget-eta-pos-dist}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_eta_pos_dist.eps}}
 \caption{\textbf{(NBA)} Vary budget $\eta^+$ for $\Obj^+$ between 2-20\%.
   Fix $\zeta = .05$, $\eta^- = .1$.}
 \label{fig:expr-budget-eta-pos}
\end{figure*}

\begin{figure*}[ht]
 \centering
 \subfloat[Precision]{\label{fig:expr-budget-eta-neg-prec}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_eta_neg_prec.eps}}
 \subfloat[Recall]{\label{fig:expr-budget-eta-neg-recl}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_eta_neg_recl.eps}}
 \subfloat[Sketch Distance]{\label{fig:expr-budget-eta-neg-dist}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_eta_neg_dist.eps}}
 \caption{\textbf{(NBA)} Vary budget $\eta^-$ for $\Obj^-$ between 2-20\%.
   Fix $\zeta = .05$, $\eta^+ = .1$.}
 \label{fig:expr-budget-eta-neg}
\end{figure*}

\begin{figure*}[ht]
 \centering
 \subfloat[Projection]{\label{fig:expr-budget-tot-prj}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_tot_prj.eps}}
 \subfloat[Count]{\label{fig:expr-budget-tot-cnt}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_tot_cnt.eps}}
 \subfloat[Streak]{\label{fig:expr-budget-tot-stk}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_tot_stk.eps}}
 \caption{\textbf{(NBA)} Fix total budget $\eta = .2$ for $\Obj^+$ and
   $\Obj^-$, vary $\eta^+$ for $\Obj^+$ between 2-18\%.}
 \label{fig:expr-budget-tot}
\end{figure*}


\begin{figure*}[t]
\begin{minipage}[b]{0.64\linewidth}
 \centering
 \subfloat[Uniform sample]
  {\label{fig:expr-dblp-sample-unif}
   \includegraphics[width=0.48\linewidth]
    {figs/dblp/unif_smpl/budget_sample_recl.eps}}
 \subfloat[Sample rate proportional to objects' data size]
  {\label{fig:expr-dblp-sample-prop}
   \includegraphics[width=0.48\linewidth]
    {figs/dblp/prop_smpl/budget_sample_recl.eps}}
 \caption{\textbf{(DBLP)} Compare recall of $\RePreProx$ with two
   different sampling strategies, at varying sample rate 1-10\%,
   fixing $\eta^+ = \eta^- = .1$.}
 \label{fig:expr-dblp-sample}
\end{minipage}
\hfill
\begin{minipage}[b]{0.32\linewidth}
 \centering
 \includegraphics[width=\linewidth]{figs/wiki/budget_eta_pos_recl.eps}
 \caption{\textbf{(WIKI)} Recall of $\RePreProx$.  Vary $\eta^+$
   2-20\%.  Fix $\zeta=.05, \eta^-=.1$} \label{fig:expr-wiki-type}
\end{minipage}
\end{figure*}


\begin{figure*}[ht]
 \centering
 \subfloat[Vary $\zeta$. Fix $\eta^+ = \eta^- = .1$.]
   {\label{fig:expr-time-budget-sample}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_sample_time.eps}}
 \subfloat[Vary $\eta^+$. Fix $\zeta = .05$, $\eta^- = .1$.]
   {\label{fig:expr-time-budget-pos}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_eta_pos_time.eps}}
 \subfloat[Vary $\eta^-$. Fix $\zeta = .05$, $\eta^+ = .1$.]
   {\label{fig:expr-time-budget-neg}
   \includegraphics[width=0.3\textwidth]{figs/nba/budget_eta_neg_time.eps}}
 \caption{\textbf{(NBA)} Efficiency of algorithms varying sample rate and budget.}
 \label{fig:expr-time-budget}
\end{figure*}

\section{Experiments}
\label{sec:expr}

We implemented our algorithms in C++ and evaluated the efficiency and result quality on three real datasets. All experiments are conducted on a machine with Intel Core i7-2600 3.4GHz CPU and 7.8GB RAM.

\subsection{Datasets}
\label{sec:expr:data}

{\bf NBA players' game-by-game performance statistics (NBA)\footnote{\url{http://www.basketball-reference.com}}:}
   By considering a player as an object and his performance
   stats in each game as a tuple, the dataset consists of 1.01
   million tuples ($\|\DataSet\| \approx 1.01 \times 10^6$) for a
   total of 3,129 players ($n = 3129$).

{\bf DBLP\footnote{\url{http://dblp.uni-trier.de}}:}
   For the DBLP data set, we consider authors as objects, and
   year-by-year performance of an author in terms of the author's
   numbers of publications in different venues; i.e., a tuple
   represents an author's performance in a given year, where
   attributes represent the author's publication counts in different
   venues in that year.  This dataset consists of $\sim$67k
   tuples ($\|\DataSet\| \approx 67 \times 10^3$) for $\sim$32k
   distinct authors ($n \approx 32 \times 10^3$)

{\bf Wikipedia edit history data (WIKI)\footnote{\url{https://snap.stanford.edu/data/wiki-meta.html}}:}
   The raw Wikipedia edit log in the main namespace consists of a total of
   $\sim$116.6 million entries, each representing a revision of an article,
   with information of the user ID, article category, edit size, etc.
   We consider users as objects, and a tuple is the (number, size) of
   all edits and minor edits in one day.  For over 3.3 million users
   ($n \approx 3.3 \times 10^6$), we have a total of $\sim$8.91 million
   tuples ($\|\DataSet\| \approx 8.91 \times 10^6$).

\subsection{Quality Evaluation}
\label{sec:expr:quality}
We evaluated the quality of $(\RePreProx, \ReSke)$ in two 
aspects---(i)~quality of $\RePreProx$ w.r.t.\
$\RePre$ in terms of recall and, less importantly, precision, and
(ii)~quality of $\ReSke$ w.r.t.\ $\Result\setminus\ReSke$ as measured
by the sketch distance function $\delta$.

Note that the baseline algorithm (Algorithm \ref{algo:base}) performs
full evaluation, and hence produces the exact $\RePre$.  Also, under
$L_\infty$-norm (rectangular neighborhood), the grid-partition-based
baseline algorithm trivially achieves $\delta(\ReSke, \Result\setminus
  \RePre)=0$.  Therefore, we are only interested in evaluating the
quality of results of the sampling-based algorithm (Algorithm
  \ref{algo:sample}).
  
The same set of experiments were conducted on all three datasets.
We first illustrate observations common to all data sets using
the NBA data set.

\paragraph{Impact of $\Obj^+$ selection strategy and neighborhood
definition.}
We tested the performance of Algorithm~\ref{algo:sample} with
(i)~different strategies for selecting $\Obj^+$ described in
Section~\ref{sec:select:pos} ($\EuScript{M}_p$ with $p=1$, $0$, $-1$,
$-\infty$), and (ii)~different $\|\cdot\|$ functions in the
neighborhood definition (Eq.~\ref{eq:nb-def}), namely $L_1$-, $L_2$-,
and $L_\infty$-norms, corresponding to diamondoid, oval, and
rectangular neighborhood, respectively.  No significant difference was
observed in the quality of result produced by
Algorithm~\ref{algo:sample}.  We omit the figures here due to limited
space.

\paragraph{Varying sample rate $\zeta$.}
Performance of Algorithm~\ref{algo:sample} at different sample rates
$\zeta$ is shown in Figure~\ref{fig:expr-budget-sample}.  Results
suggest that increasing $\zeta$ gives notable improvement in the
quality of $\RePreProx$ in the beginning
(Figures~\ref{fig:expr-budget-sample-prec}
and~\ref{fig:expr-budget-sample-recl}), but the improvement diminishes
afterwards (beyond 5\% for NBA).  On the other hand, $\zeta$ has
almost no impact on the quality of the sketch $\ReSke$ in terms of
distance to the exact solution $\Result \setminus \RePre$
(Figure~\ref{fig:expr-budget-sample-dist}).

\paragraph{Varying budget $\eta^+$ for $\Obj^+$.}
In Figure \ref{fig:expr-budget-eta-pos}, we fix $\zeta$ and $\eta^-$,
and observe the performance of Algorithm~\ref{algo:sample} with
varying budget $\eta^+$.  We see that as $\eta^+$ increases, precision
and recall keep increasing and approach 100\%
(Figures~\ref{fig:expr-budget-eta-pos-prec}
and~\ref{fig:expr-budget-eta-pos-recl}).  Similar to the results for
varying $\zeta$, $\eta^+$ shows little impact on the quality of
$\ReSke$ (Figure~\ref{fig:expr-budget-eta-pos-dist}).

\paragraph{Varying budget $\eta^-$ for $\Obj^-$.}
In contrast to $\zeta$ and $\eta^+$, increasing $\eta^-$ gives big
improvement on the quality of $\ReSke$
(Figure~\ref{fig:expr-budget-eta-neg-dist}), while recall of
$\RePreProx$ is barely affected (Figure~\ref{fig:expr-budget-eta-pos-recl}).
It is worth noting that having too few objects $\Obj^-$ may increase
the amount of false positives in $\RePreProx$, accounting for the
increase in precision shown in Figure~\ref{fig:expr-budget-eta-pos-prec}.

\paragraph{Varying overall budget $\eta$.}
Results above show how $\eta^+$ or $\eta^-$, when the other is fixed,
affects different aspects of result quality.  Now we show the tradeoff
between $\eta^+$ and $\eta^-$ when fixing the overall budget $\eta$.
%
In Figure~\ref{fig:expr-budget-tot}, we fix the total budget $\eta
  = \eta^+ + \eta^- = 20\%$, and show the three quality indicators,
varying the budget allocation between $\eta^+$ and $\eta^-$.  From
the earlier experiments, it is expected that a larger $\eta^+$ would
lead to better precision and recall, while a larger $\eta^-$ would
lead to a better sketch with a smaller distance to the ground truth.

To measure the overall quality of a solution by
Algorithm~\ref{algo:sample}, we use a generalized version of the
F-score, by taking the weighted harmonic mean of the three quality
measure, precision, recall, and distance.  We assign weights
$\beta_P^2$, $\beta_R^2$, $\beta_\delta^2$ to precision, recall, and
$1-\delta$, respectively, where
$\beta_R > \beta_\delta > \beta_P > 0$, signifying decreasing
importance.  Formally,
\begin{equation}
 F_{\beta_P, \beta_R, \beta_\delta} =
   \frac{\beta_P^2 + \beta_R^2 + \beta_\delta^2}
   {\frac{\beta_P^2}{P} + \frac{\beta_R^2}{R} + \frac{\beta_\delta^2}
     {1 - \delta(\ReSke, \Result\setminus \RePre)}}.
\end{equation}

We show results of two F-scores, $F_{1,3,2}$ and $F_{2,4,3}$.  In
Figure~\ref{fig:expr-budget-tot-prj}
and~\ref{fig:expr-budget-tot-cnt}, both $F_{1,3,2}$ and $F_{2,4,3}$
peak at $\eta^+ = 14\%$, leaving $\eta^-$ at $6\%$.  On the contrary,
for streak query (Figure~\ref{fig:expr-budget-tot-stk}), the quality
of $\ReSke$ is still decent even with $\eta^-$ as small as $2\%$,
which allows the majority of the overall budget to be allocated to
$\Obj^+$ in order to improve the quality of $\RePreProx$.
% [[[may want to use some other $\beta_P, \beta_R, \beta_\delta$ that
% shows notable difference from $F_{1,3,2}$]]]

\paragraph{Precision versus recall.}
From the results on NBA, we see a higher precision than recall when
sufficient budget is given.  The same holds for results on DBLP and
WIKI. As we have shown in Section~\ref{sec:select:neg:precise},
$\Result^-$ plays an important role in both eliminating false
positives from and keeping true positives in $\Result^+$.  While the
probability of false positives only depends on the rate at which
objects are sampled from $\Obj \setminus \Obj^+$ to form $\Obj^-$, the
probability of false negatives is conditional on the fact that the
object that yields some point of $\Result^+\cap \RePreProx$ is
included in $\Obj^+$ in the first place.  And we discuss next why it
is hard to ensure the right objects are chosen for $\Obj^+$.

We should note that the effectiveness of $\Obj^+$ selection depends on
both the data distributions and the query type.
Algorithm~\ref{algo:sample} assumes neither.  To see how the data
distribution affects the effectiveness of $\Obj^+$, we compare the
recall of $\RePreProx$ on NBA and on WIKI.  In
Figure~\ref{fig:expr-dblp-sample-unif}, we show the recall of
$\RePreProx$ on WIKI data with varying sample rate.  The result is
significantly worse than that on the NBA data
(Figure~\ref{fig:expr-budget-sample-recl}).  For NBA, the average
number of tuples per player is over 300.  In contrast, the average
number of tuples per author is below 2.  Even the most renowned
researchers have tuples for fewer than 40 years.  The consequence of a
small sample rate is a high probability of missing all tuples of an
author, even for the most prolific ones.  A second set of results
shown in Figure~\ref{fig:expr-dblp-sample-unif} is based on a
different sampling method: sample for each object at a rate
proportional to the size of the object, while the overall sample rate
remains the same.  By adopting this sampling strategy, we implicitly
assume that points of $\RePre$ are likely to be yielded by objects
with more tuples, and we get significant improvement over uniform
sampling.

The other important factor that influences the effectiveness of
$\Obj^+$ selection is the query type.  On the NBA data, we see a
notable difference between the recall $\RePreProx$ for count query and
the other two query types.  Similarly, on the WIKI data, we observe a
notable difference between \emph{streak} query and the other two types
(Figure~\ref{fig:expr-wiki-type}).  For \emph{streak} query, a point
of $\RePre$ comes from a (possibly small) number of consecutive
tuples.  Uniformly sampled tuples may not be representative at all for
the overall quality of an object w.r.t.\ a streak query.  Similarly,
for \emph{projection} query, say for NBA, a generally lousy player
could have one outstanding game that contributes to $\RePre$, which
will unlikely be picked up by a small uniform sample.  On the
contrary, any point in $\Result$ of a \emph{count} query considers all
tuples of an object, making $\Obj^+$ selection less vulnerable to
uniform samples.  Had Algorithm~\ref{algo:sample} known the behavior
of the query function $f$, smarter sampling strategies could be
deployed to better guide the selection of $\Obj^+$.

\subsection{Efficiency}
\label{sec:expr:time}

We evaluate the efficiency of the sampling-based algorithm when
varying various parameters, namely (i)~the object selection strategy,
(ii)~the distance metric ($L_\infty$-/$L_1$-/$L_2$-norm), and
(iii)~the computation budget $\eta$.  The efficiency is measured as
execution-time speed-up over the baseline algorithm
(Algorithm~\ref{algo:base}), which performs full evaluation on all
objects.  Not surprisingly, the object selection strategy and the
distance metric do not influence the efficiency of
Algorithm~\ref{algo:sample}.  Hence, the results are omitted.

\paragraph{Varying $\zeta$, $\eta^+$, and $\eta^-$.}
% In Figure~\ref{fig:expr-time-budget}, we observe a linear increase
% in execution time when increasing the sample rate $\zeta$ (if sample
% data $R_i$'s are obtained on demand instead of being pre-fetched),
% in $\eta^+$ and in $\eta^-$.  Compared to the execution time of the
% baseline algorithm (Algorithm~\ref{algo:base}), the sampling-based
% algorithm roughtly reduces the execution time by a factor of
% $\zeta + \eta$, agreeing with the asymptotic complexities provided
% in Section~\ref{sec:algo}.
Figure~\ref{fig:expr-time-budget} shows the speed-up of the
sampling-based algorithm over the baseline, varying $\zeta$, $\eta^+$,
and $\eta^-$, respectively, with the other two fixed.  The solid
\emph{optimal} curve shows the maximum possible speed-up, i.e.
$(\zeta + \eta^+ + \eta^-)^{-1}$, for linear-time query function $f$,
excluding any overhead during object selection.  Roughly the same
amount of speed-up is observed for all three query types, regardless
of the size and distribution of the input data.  Results on DBLP and
WIKI data are similar and thus omitted due to limited space.

% The integer program in Definition \ref{def:emd} is a special
% case of the Generalized Assignment Problem
% \footnote{\url{http://en.wikipedia.org/wiki/Generalized_assignment_problem}}
% \cite{???}, where the weight and the cost of assigning a point $p$
% of $P$ to a weighted point $q$ of $Q$ are both 1. \footnote{Due to
%   this specification, the integer program can be mapped to an
%   instnace of the max flow problem, which can be solve in polynomial
%   time, while the general Generalized Assignment Problem has been
%   proven to be NP-hard.}  Constraint \ref{eqn:ip-req1} states that
% at most $w_i$ points are assigned to $q_i$.  Constraint
% \ref{eqn:ip-req2} states that each point of $P$ is assigned to at
% most one point of $Q$.  Moreover, Constraint \ref{eqn:ip-req3}
% dictates that if point $p_j$ is not in the neighborhood of $q_i$,
%   $x_{ij}$ is restricted to 0, i.e. $p_j$ never gets assigned to
% $q_i$. 













% \clearpage
% 
% % DBLP figures
% % uniform samples
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-prune-prec-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/prune_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-prune-recl-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/prune_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-prune-dist-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/prune_dist.eps}}
%  \caption{Vary object selection strategies.  Fix $\zeta = .05$,
%    $\eta^+ = \eta^- = .1$. (DBLP, uniform sample)}
%  \label{fig:expr-prune-dblp-unif}
% \end{figure*}
% 
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-norm-prec-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/norm_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-norm-recl-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/norm_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-norm-dist-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/norm_dist.eps}}
%  \caption{Vary L-norms in neighborhood definition.  Fix $\zeta = .05$,
%    $\eta^+ = \eta^- = .1$. (DBLP, uniform sample)}
%  \label{fig:expr-norm-dblp-unif}
% \end{figure*}
% 
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-budget-sample-prec-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/budget_sample_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-budget-sample-recl-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/budget_sample_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-budget-sample-dist-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/budget_sample_dist.eps}}
%  \caption{Vary sample rate $\zeta$ between 1-10\%.  Fix
%    $\eta^+ = \eta^- = .1$. (DBLP, uniform sample)}
%  \label{fig:expr-budget-sample-dblp-unif}
% \end{figure*}
% 
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-budget-eta-pos-prec-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/budget_eta_pos_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-budget-eta-pos-recl-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/budget_eta_pos_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-budget-eta-pos-dist-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/budget_eta_pos_dist.eps}}
%  \caption{Vary budget $\eta^+$ for $\Obj^+$ between 2-20\%.  Fix
%    $\zeta = .05$, $\eta^- = .1$. (DBLP, uniform sample)}
%  \label{fig:expr-budget-eta-pos-dblp-unif}
% \end{figure*}
% 
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-budget-eta-neg-prec-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/budget_eta_neg_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-budget-eta-neg-recl-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/budget_eta_neg_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-budget-eta-neg-dist-dblp-unif}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/unif_smpl/budget_eta_neg_dist.eps}}
%  \caption{Vary budget $\eta^-$ for $\Obj^-$ between 2-20\%.  Fix
%    $\zeta = .05$, $\eta^+ = .1$. (DBLP, uniform sample)}
%  \label{fig:expr-budget-eta-neg-dblp-unif}
% \end{figure*}
% 
% % sample rate for each object proportional to its size in # tuples
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-prune-prec-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/prune_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-prune-recl-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/prune_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-prune-dist-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/prune_dist.eps}}
%  \caption{Vary object selection strategies.  Fix $\zeta = .05$,
%    $\eta^+ = \eta^- = .1$. (DBLP, sample rate prop. to objects' data size)}
%  \label{fig:expr-prune-dblp-prop}
% \end{figure*}
% 
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-norm-prec-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/norm_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-norm-recl-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/norm_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-norm-dist-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/norm_dist.eps}}
%  \caption{Vary L-norms in neighborhood definition.  Fix $\zeta = .05$,
%    $\eta^+ = \eta^- = .1$. (DBLP, sample rate prop. to objects' data size)}
%  \label{fig:expr-norm-dblp-prop}
% \end{figure*}
% 
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-budget-sample-prec-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/budget_sample_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-budget-sample-recl-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/budget_sample_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-budget-sample-dist-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/budget_sample_dist.eps}}
%  \caption{Vary sample rate $\zeta$ between 1-10\%.  Fix
%    $\eta^+ = \eta^- = .1$. (DBLP, sample rate prop. to objects' data size)}
%  \label{fig:expr-budget-sample-dblp-prop}
% \end{figure*}
% 
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-budget-eta-pos-prec-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/budget_eta_pos_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-budget-eta-pos-recl-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/budget_eta_pos_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-budget-eta-pos-dist-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/budget_eta_pos_dist.eps}}
%  \caption{Vary budget $\eta^+$ for $\Obj^+$ between 2-20\%.  Fix
%    $\zeta = .05$, $\eta^- = .1$. (DBLP, sample rate prop. to objects' data size)}
%  \label{fig:expr-budget-eta-pos-dblp-prop}
% \end{figure*}
% 
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-budget-eta-neg-prec-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/budget_eta_neg_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-budget-eta-neg-recl-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/budget_eta_neg_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-budget-eta-neg-dist-dblp-prop}
%    \includegraphics[width=0.3\textwidth]{figs/dblp/prop_smpl/budget_eta_neg_dist.eps}}
%  \caption{Vary budget $\eta^-$ for $\Obj^-$ between 2-20\%.  Fix
%    $\zeta = .05$, $\eta^+ = .1$. (DBLP, sample rate prop. to objects' data size)}
%  \label{fig:expr-budget-eta-neg-dblp-prop}
% \end{figure*}
% 
% % \clearpage
% 
% % Wikipedia figures
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-prune-prec-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/prune_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-prune-recl-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/prune_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-prune-dist-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/prune_dist.eps}}
%  \caption{Vary object selection strategies.  Fix $\zeta = .05$,
%    $\eta^+ = \eta^- = .1$. (WIKI)}
%  \label{fig:expr-prune-wiki}
% \end{figure*}
% 
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-norm-prec-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/norm_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-norm-recl-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/norm_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-norm-dist-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/norm_dist.eps}}
%  \caption{Vary L-norms in neighborhood definition.  Fix $\zeta = .05$,
%    $\eta^+ = \eta^- = .1$. (WIKI)}
%  \label{fig:expr-norm-wiki}
% \end{figure*}
% 
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-budget-sample-prec-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/budget_sample_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-budget-sample-recl-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/budget_sample_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-budget-sample-dist-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/budget_sample_dist.eps}}
%  \caption{Vary sample rate $\zeta$ between 1-10\%.  Fix
%    $\eta^+ = \eta^- = .1$. (WIKI)}
%  \label{fig:expr-budget-sample-wiki}
% \end{figure*}
% 
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-budget-eta-pos-prec-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/budget_eta_pos_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-budget-eta-pos-recl-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/budget_eta_pos_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-budget-eta-pos-dist-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/budget_eta_pos_dist.eps}}
%  \caption{Vary budget $\eta^+$ for $\Obj^+$ between 2-20\%.  Fix
%    $\zeta = .05$, $\eta^- = .1$. (WIKI)}
%  \label{fig:expr-budget-eta-pos-wiki}
% \end{figure*}
% 
% \begin{figure*}[p]
%  \centering
%  \subfloat[Precision]{\label{fig:expr-budget-eta-neg-prec-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/budget_eta_neg_prec.eps}}
%  \subfloat[Recall]{\label{fig:expr-budget-eta-neg-recl-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/budget_eta_neg_recl.eps}}
%  \subfloat[Sketch Distance]{\label{fig:expr-budget-eta-neg-dist-wiki}
%    \includegraphics[width=0.3\textwidth]{figs/wiki/budget_eta_neg_dist.eps}}
%  \caption{Vary budget $\eta^-$ for $\Obj^-$ between 2-20\%.  Fix
%    $\zeta = .05$, $\eta^+ = .1$. (WIKI)}
%  \label{fig:expr-budget-eta-neg-wiki}
% \end{figure*}
